import os
import sys
from Bio import SeqIO
from pylab import *


# Plot the transcript length distribution for one target and print summary
# statistics.
#
# Usage: python <script> TARGET
#
# Reads the FASTA file "<TARGET>.fa" from the hard-coded `directory` below,
# draws a histogram of the sequence lengths, and prints the mean, the median,
# and the percentage of sequences longer than 1000 nucleotides.
# NOTE(review): nothing in this script calls show() or savefig(); presumably
# it is run from an interactive pylab session -- confirm.

# Fail with a usage message instead of a bare IndexError when no target is
# given. Extra arguments are ignored, as before.
if len(sys.argv) < 2:
    sys.exit("Usage: python <script> TARGET")
target = sys.argv[1]

directory = "/osc-fs_home/mdehoon/Data/CASPARs/Filters/"
filename = "%s.fa" % target
path = os.path.join(directory, filename)

# One length per FASTA record, in file order.
records = SeqIO.parse(path, "fasta")
lengths = [len(record.seq) for record in records]

# Without this guard an empty file yields a NaN mean, and formatting NaN with
# "%d" below raises an unhelpful ValueError.
if not lengths:
    sys.exit("No sequences found in %s" % path)

lengths = array(lengths)
# The bins span the full range of the data; only lengths up to 10000 are shown.
hist(lengths, bins=1000)
xlim(0, 10000)

# "%d" truncates the mean and median to whole nucleotides.
print("Mean %s transcript length: %d" % (target, mean(lengths)))
print("Median %s transcript length: %d" % (target, median(lengths)))

# Count via the array method so the result does not depend on which `sum`
# (builtin or the one star-imported from pylab) is in scope, and multiply by
# 100.0 so the division is a float division on every Python version.
percentage = 100.0 * (lengths > 1000).sum() / len(lengths)
print("%.2f%% of %s transcript are longer than 1000 nucleotides" % (percentage, target))

xlabel("transcript length [nt]")
ylabel("number of %s transcripts" % target)
